import io
import sys
import importlib
import jieba

# Input corpus (raw UTF-8 text) and the output path for its segmented copy.
filename = './data/text2.txt'       # source corpus read by cut_txt()
cut_file = './data/text2.txt_cut'   # destination: space-separated tokens


# 此函数作用是对初始语料进行分词处理后，作为训练模型的语料
# Chinese punctuation stripped from the segmented output.
_PUNCTUATION = '，。？！“”：…（）—《》、‘’'
# Translation table built once: each punctuation char maps to None (deleted).
_PUNCT_TABLE = {ord(ch): None for ch in _PUNCTUATION}


# Segment the raw corpus so it can be used as training input for the model.
def cut_txt(old_file, cut1_file):
    """Read *old_file* (UTF-8), segment it with jieba, and write the
    space-joined tokens to *cut1_file* with Chinese punctuation removed.

    Errors are reported to stdout rather than raised, preserving the
    original best-effort behavior.
    """
    print('cut_txt begin')
    try:
        # 'with' guarantees the handles are closed (the original leaked both).
        with io.open(old_file, 'r', encoding='utf-8') as fi:
            text = fi.read()

        # Precise-mode segmentation; join tokens with spaces and strip
        # punctuation in a single translate() pass instead of 16 replace()s.
        words = jieba.cut(text, cut_all=False)
        str_out = ' '.join(words).translate(_PUNCT_TABLE)

        with io.open(cut1_file, 'w', encoding='utf-8') as fo:
            fo.write(str_out)
    except Exception as e:
        # Narrowed from BaseException (which also swallowed KeyboardInterrupt)
        # and fixed the message: the original printed the class object
        # `Exception` instead of anything useful.
        print('cut_txt error:', e)
    finally:
        # Always emit the end marker; originally it printed only on failure.
        print('cut_txt end')


if __name__ == "__main__":
    # Bug fix: the original referenced `file_name`, which does not exist —
    # the module-level constant (L6) is `filename`, so this raised NameError.
    cut_txt(filename, cut_file)





